# Render the notebook's header illustration inline (requires an IPython/Jupyter frontend).
from IPython.display import display, Image
display(Image('bigleaves.jpg', width = 800))
# library used
import operator
import os
import collections
from functools import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Preview the first two leaf photographs from the images/ directory.
names = [f for f in os.listdir('images') if f.endswith('.jpg')]
for name in names[:2]:
    # The file stem is the leaf's order number, e.g. "7" from "7.jpg".
    print('Leaf {}'.format(name.split('.')[0]))
    display(Image('images/' + name, width=300))
# Load the labelled training table; each row is one leaf (id, species, then
# the numeric margin*/shape*/texture* feature columns).
data = pd.read_csv(os.getcwd()+'/train.csv')
len(data.groupby('species')['id'].count()) # how many species
data.columns
data.info() # brief view
# Select several columns with a LIST: the bare-tuple form
# ['margin1','shape1','texture1'] was deprecated and is removed in pandas >= 2.0.
data.groupby('species')[['margin1', 'shape1', 'texture1']].mean().head() # roughly looking inside
data.head() # look at first 5 obs
rand = np.random.RandomState(0)  # fixed seed -> reproducible fold assignment
# Build per-row fold labels for 10-fold cross validation.
fold = 10
fold_size = int(len(data.index) / fold)
# fold_size copies of 0, then of 1, ... up to fold-1 (same values the old
# reduce(operator.add, ...) concatenation produced).
number = np.repeat(np.arange(fold), fold_size)
# Sampling ALL entries without replacement permutes the fold labels, giving
# each row a random fold id.
# NOTE(review): assumes len(data) is divisible by `fold` — TODO confirm for
# other datasets, otherwise `number` is shorter than `data`.
index = rand.choice(number, replace = False, size = number.shape)
def kernal(test_leaf, train, k):
    """Score one cross-validation fold with a k-nearest-neighbour vote.

    test_leaf -- held-out DataFrame (columns: id, species, feature columns
                 starting at 'margin1')
    train     -- training DataFrame with the same column layout
    k         -- number of neighbours that vote on each prediction
    Returns the fraction of rows in test_leaf classified correctly.
    """
    feature_matrix = train.iloc[:, 2:]   # drop the id and species columns
    # Plain ndarray so positional indexing below ignores the frame's original
    # row index (indexing the Series by position would hit the old labels).
    species_of = np.array(train['species'])
    hits = 0
    n_rows = len(test_leaf.index)
    for row in range(n_rows):
        sample = test_leaf.iloc[row, :]          # one leaf, label included
        truth = sample['species']
        features = sample['margin1':]            # everything from margin1 on
        # Squared Euclidean distance to every training row; sqrt is not
        # needed just to rank neighbours.
        dist2 = np.add.reduce((feature_matrix - features) ** 2, axis=1)
        nearest = np.argsort(dist2)[:k]
        # Majority vote among the k closest training labels.
        vote = collections.Counter(species_of[nearest]).most_common(1)[0][0]
        if vote == truth:
            hits += 1
    return hits / n_rows
def KNN(k, data, index):
    """Average correct-prediction rate of k-NN over cross-validation folds.

    k     -- number of neighbours passed through to kernal()
    data  -- full labelled DataFrame (id, species, feature columns)
    index -- per-row fold id array; each distinct value is held out once
    Returns the mean correct rate across all folds.
    """
    correct = []
    # Derive the fold ids from `index` itself instead of hard-coding
    # range(10); np.unique returns them sorted, so for the 0..9 labels used
    # in this script the iteration order is unchanged.
    for fold_id in np.unique(index):
        valid_set = data[index == fold_id]
        train_set = data[index != fold_id]
        correct.append(kernal(valid_set, train_set, k))
    return sum(correct) / len(correct)
# Sweep k = 1..9 and record the cross-validated correct rate for each,
# then persist the curve for the plotting cell below.
knn = np.array([KNN(i, data, index) for i in range(1, 10)])
np.savetxt('knn_test.txt', knn, delimiter=',')
%matplotlib inline
plt.style.use('seaborn-whitegrid')
knn_data = np.loadtxt('knn_test.txt', delimiter = ',')
fig = plt.figure()
ax = plt.axes()
x = np.array(range(1,10))
ax.plot(x,knn_data,'co', x, knn_data,'r--')
ax.annotate('optimal maximum', xy=(1, knn_data[0]), xytext=(3, 0.92),
arrowprops=dict(arrowstyle="->", connectionstyle="angle3,angleA=0,angleB=-90"))
ax.xaxis.set_major_locator(plt.MaxNLocator(10)) # set how many ticks
around = np.around(knn_data, decimals = 3)
for i in range(len(knn_data)):
ax.text(i+1,knn_data[i]+0.005, around[i])
ax.set(xlim = (0,10), ylim = (0.75, 0.95),
xlabel = 'K Nearest Neighbors', ylabel = 'Correct Rate',
title='Correct prediction rate on validation dataset vs KNN')
plt.show()
# Predict the Kaggle test set with 1-NN against the full training data.
# The true test labels are held by Kaggle, so accuracy can't be checked here.
test = pd.read_csv('test.csv')
test.columns # to identify anything different
train_data = data.iloc[:, 2:]
train_label = data['species'].values
predictions = []
for row in range(len(test.index)):
    leaf = test.iloc[row, 1:]  # skip the id column
    # Squared Euclidean distance to every training leaf.
    dist2 = np.add.reduce((train_data - leaf) ** 2, axis=1)
    # k = 1: the single closest training row decides the label.
    predictions.append(train_label[np.argsort(dist2)[:1]][0])
label = np.array(predictions)
index = np.array(range(1, len(test.index) + 1))  # 1-based row ids for output
# Longest species name -> width of the fixed-size string field below.
maximum = max(len(name) for name in list(data['species'].values))
output = np.zeros(label.size, dtype=[('var1', int), ('var2', 'U{}'.format(maximum))])
output['var1'] = index
output['var2'] = label
np.savetxt('knn_prediction_result_on_testDataSet.txt', output, delimiter=',', fmt='%3.i %10s')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
# Reload the training table and turn species names into integer class labels.
data = pd.read_csv(os.getcwd() + '/train.csv')
le = LabelEncoder()
# fit_transform == fit followed by transform; `le` stays fitted so its
# classes_ can map labels back to names later.
labels = le.fit_transform(data.species.values)
# Keep only the numeric feature columns from here on.
data = data.drop(['id', 'species'], axis=1)
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
def warn(*args, **kwargs): pass
import warnings
warnings.warn = warn
# 5 stratified 80/20 shuffles: every split keeps each species' proportion;
# random_state=0 makes the splits reproducible.
sss = StratifiedShuffleSplit(n_splits = 5, test_size = 0.2, random_state = 0)
# Candidate models to compare. All must support predict_proba (SVC/NuSVC
# only do when built with probability=True) because log loss is computed
# from probabilities below.
# NOTE(review): this order must stay in sync with the `classi` name array
# defined after the evaluation loop — verify when adding/removing models.
classifiers = [
KNeighborsClassifier(),
SVC(kernel="rbf", probability=True),
NuSVC(probability=True),
DecisionTreeClassifier(),
RandomForestClassifier(),
AdaBoostClassifier(),
GradientBoostingClassifier(),
GaussianNB(),
LinearDiscriminantAnalysis(),
QuadraticDiscriminantAnalysis()]
# Fit and score every classifier on each stratified split, then average
# the per-classifier accuracy and log loss over the splits.
acc_per_split = []
loss_per_split = []
for train_index, test_index in sss.split(data, labels):
    train_data = data.values[train_index]
    test_data = data.values[test_index]
    train_label = labels[train_index]
    test_label = labels[test_index]
    split_acc = []
    split_loss = []
    for clf in classifiers:
        clf.fit(train_data, train_label)
        # Hard predictions -> accuracy; class probabilities -> log loss.
        split_acc.append(accuracy_score(test_label, clf.predict(test_data)))
        split_loss.append(log_loss(test_label, clf.predict_proba(test_data)))
    acc_per_split.append(split_acc)
    loss_per_split.append(split_loss)
# Element-wise mean across splits (one value per classifier), persisted for
# the plotting cell.
accuracy = np.add.reduce(acc_per_split) / len(acc_per_split)
logloss = np.add.reduce(loss_per_split) / len(loss_per_split)
np.savetxt('accuracy.txt', accuracy, delimiter=',')
np.savetxt('logloss.txt', logloss, delimiter=',')
accuracy = np.loadtxt('accuracy.txt', delimiter = ',')
logloss =np.loadtxt('logloss.txt', delimiter = ',')
classi = np.array(['KNN', 'SVC', 'NuSVC', 'DecisionTree', 'RandomForest', 'AdaBoost','GradientBoosting',
'GaussianNB', 'LinearDiscriminantAnalysis','QuadraticDiscriminantAnalysis'])
di = {'classifier': classi, 'accuracy' : accuracy, 'logloss' :logloss}
final = pd.DataFrame.from_dict(di)
%matplotlib inline
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
sns.barplot(x='accuracy', y='classifier', data=final, color = 'g')
plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')
plt.show()
sns.barplot(x='logloss', y='classifier', data=final, hue = "classifier")
plt.xlabel('Log Loss')
plt.title('Classifier Log Loss')
plt.show()
import random
random.seed(23)
lda = LinearDiscriminantAnalysis(n_components=2)
X_r = lda.fit(data.values, labels).transform(data.values)
names = le.classes_ # get the name tents
unique = len(set(labels)) # how many different species label
r = (0.0,0.25,0.5,0.75,1.0)
colors = [(i,j,k) for i in r for j in r for k in r] # get RGB combinations by nested list comprehension
# refulling 10 times
for i in range(10):
random.shuffle(colors)
%matplotlib inline
plt.figure()
for color, i,name in zip(colors, range(unique), names):
plt.scatter(X_r[labels == i, 0], X_r[labels == i, 1], color = color, alpha = 0.4, lw = 1,
label = name)
plt.show()
%matplotlib inline
counter = input('Please enter the integer stands for how many first classes displayed: ')
counter = int(counter)
for color, i,name in zip(colors, range(unique), names):
plt.scatter(X_r[labels == i, 0], X_r[labels == i, 1], color = color, alpha = 1, lw = 2,
label = name,marker = '*')
counter -= 1
if(counter == 0):
break
plt.legend(shadow=False, scatterpoints=1, fontsize = 'medium',loc='right',bbox_to_anchor=(1.5, 0.5))
plt.title('LDA of Leaf dataset')
plt.show()
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_r2 = pca.fit(data.values).transform(data.values)
%matplotlib inline
plt.figure()
counter = input('Please enter the integer stands for how many first classes displayed: ')
counter = int(counter)
for color, i, name in zip(colors, range(unique), names):
plt.scatter(X_r2[labels == i, 0], X_r2[labels == i, 1], color=color, alpha=1, lw=2,
label=name, marker = '+')
counter -= 1
if(counter == 0):
break
plt.legend(shadow=False, scatterpoints=1, fontsize = 'medium',loc='right',bbox_to_anchor=(1.5, 0.5))
plt.title('PCA of Leaf dataset')
plt.show()
# Final submission: LDA class probabilities for every test leaf, written in
# Kaggle's expected layout (id column first, one probability per species).
test = pd.read_csv('test.csv')
test_data = test.iloc[:, 1:].values  # feature columns only
test_ids = test['id']
clf = LinearDiscriminantAnalysis()
clf.fit(data.values, labels)
result = clf.predict_proba(test_data)
df = pd.DataFrame(result, columns=le.classes_)
df.insert(0, 'id', test_ids)
df.to_csv('leaf_classifer_prob.csv', index=False)
df.head()